# !pip install -q -U "tensorflow-text==2.8.*"
# !pip3 install pycocotools
# # conda install -c conda-forge pycocotools
# !pip install tf-models-official==2.7.1
# !pip install torch
# !pip install seaborn
# # !pip install jovian
import os
import shutil
import tensorflow as tf
# import tensorflow_hub as hub
# import tensorflow_text as text
# from official.nlp import optimization # to create AdamW optimizer
import matplotlib.pyplot as plt
tf.get_logger().setLevel('ERROR')
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import csv
from sklearn.utils import shuffle
import tensorflow as tf
# import jovian
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import matplotlib
# Run every torch computation on the CPU.
device = torch.device('cpu')

# Folder holding the project data on the author's machine, and the CSV inside it.
data_path = '/Users/andraacsintoae/Desktop/MASTER/2nd year/Sem 2/InfoVisualization/Project/'
train_path = os.path.join(data_path, "netflix_titles.csv")

# Load the catalogue into a DataFrame and preview its first rows.
pd_train = pd.read_csv(train_path)
print(pd_train.head())
show_id type title director \
0 s1 TV Show 3% NaN
1 s2 Movie 7:19 Jorge Michel Grau
2 s3 Movie 23:59 Gilbert Chan
3 s4 Movie 9 Shane Acker
4 s5 Movie 21 Robert Luketic
cast country \
0 João Miguel, Bianca Comparato, Michel Gomes, R... Brazil
1 Demián Bichir, Héctor Bonilla, Oscar Serrano, ... Mexico
2 Tedd Chan, Stella Chung, Henley Hii, Lawrence ... Singapore
3 Elijah Wood, John C. Reilly, Jennifer Connelly... United States
4 Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar... United States
date_added release_year rating duration \
0 August 14, 2020 2020 TV-MA 4 Seasons
1 December 23, 2016 2016 TV-MA 93 min
2 December 20, 2018 2011 R 78 min
3 November 16, 2017 2009 PG-13 80 min
4 January 1, 2020 2008 PG-13 123 min
listed_in \
0 International TV Shows, TV Dramas, TV Sci-Fi &...
1 Dramas, International Movies
2 Horror Movies, International Movies
3 Action & Adventure, Independent Movies, Sci-Fi...
4 Dramas
description
0 In a future where the elite inhabit an island ...
1 After a devastating earthquake hits Mexico Cit...
2 When an army recruit is found dead, his fellow...
3 In a postapocalyptic world, rag-doll robots hi...
4 A brilliant group of students become card-coun...
# Summarise the frame: column dtypes, non-null counts and memory usage.
pd_train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7787 entries, 0 to 7786 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 show_id 7787 non-null object 1 type 7787 non-null object 2 title 7787 non-null object 3 director 5398 non-null object 4 cast 7069 non-null object 5 country 7280 non-null object 6 date_added 7777 non-null object 7 release_year 7787 non-null int64 8 rating 7780 non-null object 9 duration 7787 non-null object 10 listed_in 7787 non-null object 11 description 7787 non-null object dtypes: int64(1), object(11) memory usage: 730.2+ KB
# Number of distinct values in each column.
pd_train.nunique()
show_id 7787 type 2 title 7787 director 4049 cast 6831 country 681 date_added 1565 release_year 73 rating 14 duration 216 listed_in 492 description 7769 dtype: int64
#pd_train.drop(columns = ['title','director','cast','country','date_added','release_year','rating','duration','listed_in'], axis =1 , inplace = True)
# Boolean frame flagging every missing cell; it feeds both the heatmap and
# the per-column totals below.
missing_cells = pd_train.isnull()
sns.heatmap(missing_cells, cbar=False)
plt.title('Null Values Heatmap')
plt.show()
# Per-column count of missing values.
missing_cells.sum()
show_id 0 type 0 title 0 director 2389 cast 718 country 507 date_added 10 release_year 0 rating 7 duration 0 listed_in 0 description 0 dtype: int64
# Report the number of rows in the loaded frame, with a thousands separator.
print('Number of training sentences: {:,}\n'.format(pd_train.shape[0]))
Number of training sentences: 7,787
# Descriptive statistics of the numeric columns, computed separately per type.
pd_train.groupby('type').describe()
| release_year | ||||||||
|---|---|---|---|---|---|---|---|---|
| count | mean | std | min | 25% | 50% | 75% | max | |
| type | ||||||||
| Movie | 5377.0 | 2012.920030 | 9.663282 | 1942.0 | 2012.0 | 2016.0 | 2018.0 | 2021.0 |
| TV Show | 2410.0 | 2016.191701 | 5.664826 | 1925.0 | 2015.0 | 2018.0 | 2019.0 | 2021.0 |
# Rows whose type is 'Movie', followed by the subset's (rows, columns) shape.
is_movie = pd_train['type'].eq('Movie')
df_movie = pd_train.loc[is_movie]
df_movie.shape
(5377, 12)
# Rows whose type is 'TV Show', followed by the subset's (rows, columns) shape.
is_tv_show = pd_train['type'].eq('TV Show')
df_tvshow = pd_train.loc[is_tv_show]
df_tvshow.shape
(2410, 12)
# Show two randomly chosen movie rows (unseeded, so the rows vary per run).
df_movie.sample(2)
| show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 527 | s528 | Movie | ANIMA | Paul Thomas Anderson | Thom Yorke | United Kingdom | June 27, 2019 | 2019 | TV-PG | 15 min | Dramas, Independent Movies, Music & Musicals | In a short musical film directed by Paul Thoma... |
| 1585 | s1586 | Movie | Dana Carvey: Straight White Male, 60 | Marcus Raboy | Dana Carvey | United States | November 4, 2016 | 2016 | TV-MA | 64 min | Stand-Up Comedy | Emmy-winning comedian Dana Carvey blends pitch... |
# Independent copy of the movie rows. The mask is built from df_movie itself:
# indexing df_movie with a mask derived from pd_train has a different index,
# so pandas has to reindex the key and emits
# "UserWarning: Boolean Series key will be reindexed to match DataFrame index".
dataset_split_moovie = df_movie[df_movie['type'] == 'Movie'].copy()
dataset_split_moovie.head()
/Applications/anaconda3/envs/PyTorch/lib/python3.7/site-packages/ipykernel_launcher.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index. """Entry point for launching an IPython kernel.
| show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | s2 | Movie | 7:19 | Jorge Michel Grau | Demián Bichir, Héctor Bonilla, Oscar Serrano, ... | Mexico | December 23, 2016 | 2016 | TV-MA | 93 min | Dramas, International Movies | After a devastating earthquake hits Mexico Cit... |
| 2 | s3 | Movie | 23:59 | Gilbert Chan | Tedd Chan, Stella Chung, Henley Hii, Lawrence ... | Singapore | December 20, 2018 | 2011 | R | 78 min | Horror Movies, International Movies | When an army recruit is found dead, his fellow... |
| 3 | s4 | Movie | 9 | Shane Acker | Elijah Wood, John C. Reilly, Jennifer Connelly... | United States | November 16, 2017 | 2009 | PG-13 | 80 min | Action & Adventure, Independent Movies, Sci-Fi... | In a postapocalyptic world, rag-doll robots hi... |
| 4 | s5 | Movie | 21 | Robert Luketic | Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar... | United States | January 1, 2020 | 2008 | PG-13 | 123 min | Dramas | A brilliant group of students become card-coun... |
| 6 | s7 | Movie | 122 | Yasir Al Yasiri | Amina Khalil, Ahmed Dawood, Tarek Lotfy, Ahmed... | Egypt | June 1, 2020 | 2019 | TV-MA | 95 min | Horror Movies, International Movies | After an awful accident, a couple admitted to ... |
# Independent copy of the TV-show rows, so later column edits do not touch
# df_tvshow; then preview the first rows.
tv_mask = df_tvshow['type'].eq('TV Show')
dataset_split_tv = df_tvshow.loc[tv_mask].copy()
dataset_split_tv.head()
| show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | s1 | TV Show | 3% | NaN | João Miguel, Bianca Comparato, Michel Gomes, R... | Brazil | August 14, 2020 | 2020 | TV-MA | 4 Seasons | International TV Shows, TV Dramas, TV Sci-Fi &... | In a future where the elite inhabit an island ... |
| 5 | s6 | TV Show | 46 | Serdar Akar | Erdal Beşikçioğlu, Yasemin Allen, Melis Birkan... | Turkey | July 1, 2017 | 2016 | TV-MA | 1 Season | International TV Shows, TV Dramas, TV Mysteries | A genetics professor experiments with a treatm... |
| 11 | s12 | TV Show | 1983 | NaN | Robert Więckiewicz, Maciej Musiał, Michalina O... | Poland, United States | November 30, 2018 | 2018 | TV-MA | 1 Season | Crime TV Shows, International TV Shows, TV Dramas | In this dark alt-history thriller, a naïve law... |
| 12 | s13 | TV Show | 1994 | Diego Enrique Osorno | NaN | Mexico | May 17, 2019 | 2019 | TV-MA | 1 Season | Crime TV Shows, Docuseries, International TV S... | Archival video and new interviews examine Mexi... |
| 16 | s17 | TV Show | Feb-09 | NaN | Shahd El Yaseen, Shaila Sabt, Hala, Hanadi Al-... | NaN | March 20, 2019 | 2018 | TV-14 | 1 Season | International TV Shows, TV Dramas | As a psychology professor faces Alzheimer's, h... |
# Columns that must be populated for a movie row to be kept.
required_movie_columns = ["show_id", "type", "title", "director", "cast",
                          "country", "description", "date_added", "rating",
                          "duration"]
# Treat blank cells ('' or ' ') as missing *before* dropping incomplete rows.
# Replacing them with the string 'null' (as before) made them look populated,
# so dropna kept them and the later integer cast of `duration` could fail.
dataset_split_moovie = (
    dataset_split_moovie
    .replace(['', ' '], np.nan)
    .dropna(subset=required_movie_columns)
)
print(dataset_split_moovie.head())
show_id type title director \
1 s2 Movie 7:19 Jorge Michel Grau
2 s3 Movie 23:59 Gilbert Chan
3 s4 Movie 9 Shane Acker
4 s5 Movie 21 Robert Luketic
6 s7 Movie 122 Yasir Al Yasiri
cast country \
1 Demián Bichir, Héctor Bonilla, Oscar Serrano, ... Mexico
2 Tedd Chan, Stella Chung, Henley Hii, Lawrence ... Singapore
3 Elijah Wood, John C. Reilly, Jennifer Connelly... United States
4 Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar... United States
6 Amina Khalil, Ahmed Dawood, Tarek Lotfy, Ahmed... Egypt
date_added release_year rating duration \
1 December 23, 2016 2016 TV-MA 93 min
2 December 20, 2018 2011 R 78 min
3 November 16, 2017 2009 PG-13 80 min
4 January 1, 2020 2008 PG-13 123 min
6 June 1, 2020 2019 TV-MA 95 min
listed_in \
1 Dramas, International Movies
2 Horror Movies, International Movies
3 Action & Adventure, Independent Movies, Sci-Fi...
4 Dramas
6 Horror Movies, International Movies
description
1 After a devastating earthquake hits Mexico Cit...
2 When an army recruit is found dead, his fellow...
3 In a postapocalyptic world, rag-doll robots hi...
4 A brilliant group of students become card-coun...
6 After an awful accident, a couple admitted to ...
# Movies: strip the ' min' suffix and store the duration as an integer.
minutes_text = dataset_split_moovie['duration'].str.replace(' min', '')
dataset_split_moovie['duration'] = minutes_text.astype(int)

# TV shows: the duration column holds a season count, so rename it, normalise
# the singular '1 Season' to the plural form, then strip the suffix and cast.
dataset_split_tv = dataset_split_tv.rename(columns={'duration': 'seasons'})
dataset_split_tv = dataset_split_tv.replace({'seasons': {'1 Season': '1 Seasons'}})
seasons_text = dataset_split_tv['seasons'].str.replace(' Seasons', '')
dataset_split_tv['seasons'] = seasons_text.astype(int)
It'd be interesting to see the comparison between the total number of movies and shows in this dataset just to get an idea of which one is the majority.
# Bar chart of how many titles are movies versus TV shows.
plt.figure(figsize=(7, 5))
# Pass the column by keyword: seaborn deprecated positional data vectors
# (FutureWarning) and rejects them from version 0.12 on.
g = sns.countplot(x=pd_train.type, palette="pastel")
plt.title("Count of Movies and TV Shows")
plt.xlabel("Type (Movie/TV Show)")
plt.ylabel("Total Count")
plt.show()
/Applications/anaconda3/envs/PyTorch/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning
# Share of movies versus TV shows; the counts are computed once and reused
# for both the wedge sizes and their labels.
type_counts = pd_train.type.value_counts()
plt.figure(figsize=(12, 6))
plt.title("% of Netflix Titles that are either Movies or TV Shows")
g = plt.pie(
    type_counts,
    explode=(0.025, 0.025),
    labels=type_counts.index,
    colors=['skyblue', 'navajowhite'],
    autopct='%1.1f%%',
    startangle=180,
)
plt.legend()
plt.show()
Now, we will explore the ratings which are based on the film rating system.
# Display order of the rating categories on the x axis.
order = ['G', 'TV-Y', 'TV-G', 'PG', 'TV-Y7', 'TV-Y7-FV', 'TV-PG', 'PG-13', 'TV-14', 'R', 'NC-17', 'TV-MA']
plt.figure(figsize=(15, 7))
# Keyword arguments only: seaborn deprecated positional data vectors
# (FutureWarning) and rejects them from version 0.12 on.
g = sns.countplot(x=pd_train.rating, hue=pd_train.type, order=order, palette="pastel")
plt.title("Ratings for Movies & TV Shows")
plt.xlabel("Rating")
plt.ylabel("Total Count")
plt.show()
/Applications/anaconda3/envs/PyTorch/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning
# Side-by-side rating counts: movies on the left, TV shows on the right.
fig, ax = plt.subplots(1, 2, figsize=(19, 5))

# Each panel must only see its own title type; previously both panels plotted
# the ratings of the whole catalogue and therefore showed identical bars.
movie_ratings = pd_train.loc[pd_train['type'] == 'Movie', 'rating']
tv_ratings = pd_train.loc[pd_train['type'] == 'TV Show', 'rating']

g1 = sns.countplot(x=movie_ratings, order=order, palette="Set2", ax=ax[0])
g1.set_title("Ratings for Movies")
g1.set_xlabel("Rating")
g1.set_ylabel("Total Count")

g2 = sns.countplot(x=tv_ratings, order=order, palette="Set2", ax=ax[1])
g2.set(yticks=np.arange(0, 1600, 200))
g2.set_title("Ratings for TV Shows")
g2.set_xlabel("Rating")
g2.set_ylabel("Total Count")

# plt.show() renders with the inline backend; fig.show() only warned that a
# non-GUI backend cannot show the figure.
plt.show()
/Applications/anaconda3/envs/PyTorch/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning /Applications/anaconda3/envs/PyTorch/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning /Applications/anaconda3/envs/PyTorch/lib/python3.7/site-packages/ipykernel_launcher.py:11: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure. # This is added back by InteractiveShellApp.init_path()
import plotly.express as px

# Ratings of titles released in 2020 (the variable keeps its original name).
# Only rows lacking a value the chart actually uses are removed: a bare
# dropna() also discarded every title with, for example, no director, which
# skews the per-type counts.
rating_2021 = pd_train.dropna(subset=["rating", "type", "release_year"]).copy()
fig = px.bar(rating_2021.query("release_year==2020"), x="rating", color="type", title="Ratings for Movies and TV Shows released in 2020")
fig.show()
# Derive the year and month each title was added, for the full catalogue and
# for both per-type frames. date_added is parsed once per frame and both
# calendar fields are read from that single index.
for frame in (pd_train, dataset_split_moovie, dataset_split_tv):
    added_on = pd.DatetimeIndex(frame['date_added'])
    frame['year_added'] = added_on.year
    frame['month_added'] = added_on.month
Now we will take a look at the amount of content Netflix has added over the previous years.
import plotly.graph_objects as go

# Number of titles added per year, as a two-column frame (year, count).
# rename_axis + reset_index(name=...) names both columns explicitly; renaming
# the 'index' / 'year_added' columns after value_counts() depends on the pandas
# version (2.0 changed how the result is named) and left no 'year' column.
netflix_year = (
    pd_train['year_added']
    .value_counts()
    .rename_axis('year')
    .reset_index(name='count')
)
# netflix_year = netflix_year[netflix_year.year != 2020]
# sort by year
netflix_year = netflix_year.sort_values(by=['year'], axis=0)

# Plotly figure specification: a single bar trace of count per year.
dict_of_figure = {
    "data": [{"type": "bar",
              "x": netflix_year["year"],
              "y": netflix_year["count"]}],
    "layout": {"title": {"text": "Content added on Netflix each year between 2008 and 2021"}},
}
figure = go.Figure(dict_of_figure)
figure.show()
# create two dataframes, one for rows only containing Movie type and the other for rows only containing TV Shows type
# (plain boolean indexing; the previous column-wise apply(lambda ...) produced
# the same rows with one filtering pass per column)
dataset_movies = pd_train[pd_train['type'].isin(['Movie'])]
dataset_TV_shows = pd_train[pd_train['type'].isin(['TV Show'])]

# Titles added per calendar month. The columns are named explicitly so the
# result does not depend on how the installed pandas names value_counts() output.
Movie_month = dataset_movies['month_added'].value_counts().rename_axis('month').reset_index(name='count')
TVShows_month = dataset_TV_shows['month_added'].value_counts().rename_axis('month').reset_index(name='count')
# sort by month
Movie_month = Movie_month.sort_values(by=['month'], axis=0)
TVShows_month = TVShows_month.sort_values(by=['month'], axis=0)

from plotly.subplots import make_subplots

# One panel per type; each panel overlays a line on the monthly bars.
figure_month = make_subplots(rows=1, cols=2)
figure_month.add_trace(go.Bar(x=Movie_month["month"], y=Movie_month["count"]), row=1, col=1)
figure_month.add_trace(go.Scatter(x=Movie_month["month"], y=Movie_month["count"], mode="lines"), row=1, col=1)
figure_month.add_trace(go.Bar(x=TVShows_month["month"], y=TVShows_month["count"]), row=1, col=2)
figure_month.add_trace(go.Scatter(x=TVShows_month["month"], y=TVShows_month["count"], mode="lines"), row=1, col=2)
figure_month.layout.title = 'Trend of content released each month (left - Movies; right - TV Shows)'
figure_month.show()
# Month-by-year grid of how many titles were added: count months within each
# year, pivot months into columns, fill empty cells with 0 and transpose so
# that months become rows and years become columns.
per_year_month_counts = pd_train.groupby('year_added')['month_added'].value_counts()
year_by_month = per_year_month_counts.unstack().fillna(0)
month_year_df = year_by_month.T

plt.figure(figsize=(11, 8))
sns.heatmap(month_year_df, linewidths=0.025, cmap="YlGnBu")
plt.title("Content Heatmap")
plt.ylabel("Month")
plt.xlabel("Year")
plt.show()
# Left: distribution of movie running times. Right: TV shows per season count.
fig, ax = plt.subplots(1, 2, figsize=(19, 5))

# histplot with a KDE overlay on a density scale replaces the deprecated
# distplot, which seaborn has announced for removal.
g1 = sns.histplot(x=dataset_split_moovie.duration, kde=True, stat='density', color='skyblue', ax=ax[0])
g1.set_xticks(np.arange(0, 360, 30))
g1.set_title("Duration Distribution for Netflix Movies")
g1.set_ylabel("% of All Netflix Movies")
g1.set_xlabel("Duration (minutes)")

# Keyword argument: positional data vectors are rejected from seaborn 0.12 on.
g2 = sns.countplot(x=dataset_split_tv.seasons, color='skyblue', ax=ax[1])
g2.set_title("Netflix TV Shows Seasons")
g2.set_ylabel("Count")
g2.set_xlabel("Season(s)")

# plt.show() renders with the inline backend; fig.show() only warned that a
# non-GUI backend cannot show the figure.
plt.show()
/Applications/anaconda3/envs/PyTorch/lib/python3.7/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). /Applications/anaconda3/envs/PyTorch/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. /Applications/anaconda3/envs/PyTorch/lib/python3.7/site-packages/ipykernel_launcher.py:11: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.
import plotly.graph_objects as go
from ipywidgets import widgets

# dataset_split_tv.seasons
show_duration = dataset_split_tv.copy()
# dataset_split_moovie
# Keep only the first genre listed for each show.
show_duration.listed_in = show_duration.listed_in.str.split(', ', expand=True)[0]
tv_genres = list(show_duration['listed_in'].unique())

# Slider: upper bound on the number of seasons. IntSlider expects integers,
# so the bounds are cast explicitly and the step is an int rather than 1.0.
duration = widgets.IntSlider(
    value=int(show_duration.seasons.min()),
    min=int(show_duration.seasons.min()),
    max=int(show_duration.seasons.max()),
    step=1,
    description='Number of seasons:',
    continuous_update=False)
interval = widgets.HBox(children=[duration])

# Dropdown: genre to display.
dropdown = widgets.Dropdown(
    options=tv_genres,
    value='International TV Shows',
    description='Genre:',
)

# Figure widget seeded with a single histogram trace covering every show.
trace = go.Histogram(x=show_duration['seasons'], opacity=0.95, name='Number of Series in TV Show')
h = go.FigureWidget(data=[trace])


def _make_seasons_callbacks(frame, slider, genre_picker, figure):
    """Build the (validate, response) callbacks for the TV-show explorer.

    The callbacks close over the objects passed in here instead of reading
    module-level names at call time. Names such as ``duration``, ``dropdown``
    and ``validate`` are reassigned further down in this file; callbacks that
    looked them up globally would then drive the wrong widgets.

    Args:
        frame: DataFrame with ``seasons`` and ``listed_in`` columns.
        slider: widget whose ``value`` is the maximum number of seasons.
        genre_picker: widget whose ``value`` is the selected genre.
        figure: FigureWidget whose first trace is updated in place.

    Returns:
        Tuple ``(validate, response)``.
    """

    def _validate():
        # True when the selected genre occurs in the data.
        return genre_picker.value in frame['listed_in'].unique()

    def _response(change):
        # Observer callback; ``change`` is supplied by the widget and unused.
        if not _validate():
            return
        # Vectorised mask: at most the selected seasons AND the selected genre.
        keep = (frame['seasons'] <= slider.value) & (frame['listed_in'] == genre_picker.value)
        with figure.batch_update():
            figure.data[0].x = frame.loc[keep, 'seasons']
            figure.layout.xaxis.title = 'Number of seasons'
            figure.layout.yaxis.title = 'Number of TV Shows'

    return _validate, _response


validate, response = _make_seasons_callbacks(show_duration, duration, dropdown, h)
duration.observe(response, names="value")
dropdown.observe(response, names="value")
dropdown_container = widgets.HBox([dropdown])
widgets.VBox([interval,
              dropdown_container,
              h])
VBox(children=(HBox(children=(IntSlider(value=1, continuous_update=False, description='Number of seasons:', ma…
import plotly.graph_objects as go
from ipywidgets import widgets

movie_duration = dataset_split_moovie.copy()
# dataset_split_moovie
# Keep only the first genre listed for each movie.
movie_duration.listed_in = movie_duration.listed_in.str.split(', ', expand=True)[0]
movie_genres = list(movie_duration['listed_in'].unique())
# ['Dramas', 'Horror Movies', 'Action & Adventure', 'Documentaries',
# 'Independent Movies', 'Comedies', 'Sci-Fi & Fantasy',
# 'International Movies', 'Children & Family Movies', 'Movies',
# 'Classic Movies', 'Thrillers', 'Stand-Up Comedy', 'Anime Features',
# 'Music & Musicals', 'Cult Movies', 'Romantic Movies',
# 'LGBTQ Movies']

# Slider: upper bound on the running time. IntSlider expects integers, so
# the bounds are cast explicitly and the step is an int rather than 1.0.
duration = widgets.IntSlider(
    value=int(movie_duration.duration.min()),
    min=int(movie_duration.duration.min()),
    max=int(movie_duration.duration.max()),
    step=1,
    description='Duration (in minutes):',
    continuous_update=False)
interval = widgets.HBox(children=[duration])

# Dropdown: genre to display.
dropdown = widgets.Dropdown(
    options=movie_genres,
    value='Dramas',
    description='Genre:',
)

# Figure widget seeded with a single histogram trace covering every movie.
trace = go.Histogram(x=movie_duration['duration'], opacity=0.95, name='Duration of movie')
g = go.FigureWidget(data=[trace])


def _make_movie_callbacks(frame, slider, genre_picker, figure):
    """Build the (validate, response) callbacks for the movie explorer.

    The callbacks close over the objects passed in here instead of reading
    module-level names at call time. ``g`` is reassigned to a seaborn plot
    further down in this file, and ``duration`` / ``dropdown`` / ``validate``
    are shared with another widget cell; callbacks that looked those names up
    globally would break once any of them is rebound.

    Args:
        frame: DataFrame with ``duration`` and ``listed_in`` columns.
        slider: widget whose ``value`` is the maximum duration in minutes.
        genre_picker: widget whose ``value`` is the selected genre.
        figure: FigureWidget whose first trace is updated in place.

    Returns:
        Tuple ``(validate, response)``.
    """

    def _validate():
        # True when the selected genre occurs in the data.
        return genre_picker.value in frame['listed_in'].unique()

    def _response(change):
        # Observer callback; ``change`` is supplied by the widget and unused.
        if not _validate():
            return
        # Vectorised mask: at most the selected duration AND the selected genre.
        keep = (frame['duration'] <= slider.value) & (frame['listed_in'] == genre_picker.value)
        with figure.batch_update():
            figure.data[0].x = frame.loc[keep, 'duration']
            figure.layout.barmode = 'overlay'
            figure.layout.xaxis.title = 'Duration (in minutes)'
            figure.layout.yaxis.title = 'Number of Movies'

    return _validate, _response


validate, response = _make_movie_callbacks(movie_duration, duration, dropdown, g)
duration.observe(response, names="value")
dropdown.observe(response, names="value")
dropdown_container = widgets.HBox([dropdown])
widgets.VBox([interval,
              dropdown_container,
              g])
VBox(children=(HBox(children=(IntSlider(value=8, continuous_update=False, description='Duration (in minutes):'…
import plotly.express as px

# Drop titles without a description or a country. Blank cells ('' or ' ')
# are converted to NaN first: replacing them with the string 'null' (as
# before) made them look populated, so dropna never removed them.
pd_train = pd_train.replace(['', ' '], np.nan).dropna(subset=["description", "country"])
fig = px.scatter(pd_train, x = 'title', y = 'duration', color='type',title='Duration of Netflix movies and TV shows in each country by type')
fig.show()
# One entry per (title, country) pair: split the comma-separated country
# field into columns, stack them into a single Series and keep only the
# title level of the index.
titles_by_country = pd_train.set_index('title').country
country_columns = titles_by_country.str.split(', ', expand=True)
filtered_countries = country_columns.stack().reset_index(level=1, drop=True)

# Discard the placeholder used when the country is not known.
is_known_country = filtered_countries != 'Country Unavailable'
filtered_countries = filtered_countries[is_known_country]

# Horizontal bars for the 20 most frequent countries.
top_countries = filtered_countries.value_counts().index[:20]
plt.figure(figsize=(7, 9))
g = sns.countplot(y=filtered_countries, order=top_countries)
plt.title('Top 20 Countries on Netflix')
plt.xlabel('Titles')
plt.ylabel('Country')
plt.show()
import plotly.express as px

dataset = pd_train.copy()
# drop columns from dataset that aren't required for the plot and rows with null values
unused_columns = ['show_id', 'director', 'cast', 'rating', 'description',
                  'listed_in', 'date_added', 'duration', 'year_added',
                  'month_added']
country_distribution = dataset.drop(columns=unused_columns).dropna(axis=0)

# get all unique countries on the dataset and then select top 20 countries
titles_by_country = pd_train.set_index('title').country
filtered_countries = titles_by_country.str.split(', ', expand=True).stack().reset_index(level=1, drop=True)
filtered_countries_top = filtered_countries.value_counts().index[:20]

# keep only the rows whose country field is one of the Top 20 content creating countries
in_top_countries = country_distribution['country'].isin(filtered_countries_top)
country_distribution = country_distribution.loc[in_top_countries]

data = country_distribution
figure_countries = px.scatter(data, x="release_year", y="country", color="type", title="Overview of the time when top countries entered the Movie and TV Shows industry")
figure_countries.show()
# One entry per (title, genre) pair: split the comma-separated genre field
# into columns, stack them into a single Series and keep only the title
# level of the index.
titles_by_genre = pd_train.set_index('title').listed_in
genre_columns = titles_by_genre.str.split(', ', expand=True)
filtered_genres = genre_columns.stack().reset_index(level=1, drop=True)

# Horizontal bars for the 20 most frequent genres.
top_genres = filtered_genres.value_counts().index[:20]
plt.figure(figsize=(7, 9))
g = sns.countplot(y=filtered_genres, order=top_genres)
plt.title('Top 20 Genres on Netflix')
plt.xlabel('Count')
plt.ylabel('Genres')
plt.show()

import plotly.express as px

# Scatter of the full genre string against release year, coloured by type.
data = pd_train.copy()
figure_genres = px.scatter(
    data,
    x="release_year",
    y="listed_in",
    color="type",
    title="Genre overview for Netflix content by type",
)
figure_genres.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
# Colour palette indexed by predicted label in plot2d / plot3d (10 entries).
COLORS = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 'tab:brown', 'tab:pink', 'tab:olive', 'tab:cyan', 'tab:gray', ]
# Marker shapes indexed by true label in plot2d / plot3d (15 entries); the
# last one ('X') is also used to draw centroids.
MARKERS = ['o', 'v', 's', '<', '>', '8', '^', 'p', '*', 'h', 'H', 'D', 'd', 'P', 'X']
# plotting 2D function
def plot2d(X, y_pred, y_true, mode=None, centroids=None):
    """Scatter-plot samples in 2-D, coloured by prediction and shaped by truth.

    Args:
        X: samples, one row per sample. Must have exactly two columns unless
            ``mode`` is given.
        y_pred: integer label per sample; selects the colour from COLORS.
        y_true: integer label per sample; selects the marker from MARKERS.
            Must have the same length as ``y_pred`` and ``X``.
        mode: optional reducer class (e.g. PCA); it is instantiated with
            ``n_components=2`` and fitted on ``X``.
        centroids: optional points drawn as large red markers. They are
            projected with the reducer fitted on ``X`` when one is used.

    Raises:
        AssertionError: if the data to plot does not have two columns.
    """
    transformer = None
    X_r = X
    if mode is not None:
        transformer = mode(n_components=2)
        X_r = transformer.fit_transform(X)
    X_r = np.asarray(X_r)
    assert X_r.ndim == 2 and X_r.shape[1] == 2, 'plot2d only works with 2-dimensional data'

    predicted = np.asarray(y_pred)
    actual = np.asarray(y_true)
    plt.grid()
    # Draw one artist per (predicted, true) pair instead of one per sample;
    # the picture is the same but far fewer artists are created.
    for pred, true in sorted(set(zip(predicted.tolist(), actual.tolist()))):
        members = (predicted == pred) & (actual == true)
        # Wrap the indices so label sets larger than the palettes do not
        # raise IndexError (colours and markers then repeat).
        plt.plot(X_r[members, 0], X_r[members, 1],
                 linestyle='',
                 c=COLORS[pred % len(COLORS)],
                 marker=MARKERS[true % len(MARKERS)])

    if centroids is not None:
        C_r = centroids
        if transformer is not None:
            if hasattr(transformer, 'transform'):
                # Reuse the projection fitted on X so that centroids land in
                # the same coordinate system as the samples.
                C_r = transformer.transform(centroids)
            else:
                # Reducers without transform() cannot project new points;
                # keep the previous behaviour of fitting on the centroids.
                C_r = transformer.fit_transform(centroids)
        for cx in C_r:
            plt.plot(cx[0], cx[1],
                     marker=MARKERS[-1],
                     markersize=10,
                     c='red')
    plt.show()
# plotting 3D function
def plot3d(X, y_pred, y_true, mode=None, centroids=None):
    """Scatter-plot samples in 3-D, coloured by prediction and shaped by truth.

    Args:
        X: samples, one row per sample. Must have exactly three columns
            unless ``mode`` is given.
        y_pred: integer label per sample; selects the colour from COLORS.
        y_true: integer label per sample; selects the marker from MARKERS.
            Must have the same length as ``y_pred`` and ``X``.
        mode: optional reducer class (e.g. PCA); it is instantiated with
            ``n_components=3`` and fitted on ``X``.
        centroids: optional points drawn as large red markers. They are
            projected with the reducer fitted on ``X`` when one is used.

    Raises:
        AssertionError: if the data to plot does not have three columns.
    """
    transformer = None
    X_r = X
    if mode is not None:
        transformer = mode(n_components=3)
        X_r = transformer.fit_transform(X)
    X_r = np.asarray(X_r)
    assert X_r.ndim == 2 and X_r.shape[1] == 3, 'plot3d only works with 3-dimensional data'

    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')
    ax.elev = 30
    ax.azim = 120

    predicted = np.asarray(y_pred)
    actual = np.asarray(y_true)
    # Draw one artist per (predicted, true) pair instead of one per sample;
    # the picture is the same but far fewer artists are created.
    for pred, true in sorted(set(zip(predicted.tolist(), actual.tolist()))):
        members = (predicted == pred) & (actual == true)
        # Wrap the indices so label sets larger than the palettes do not
        # raise IndexError (colours and markers then repeat).
        ax.plot(X_r[members, 0], X_r[members, 1], X_r[members, 2],
                zdir='z',
                linestyle='',
                c=COLORS[pred % len(COLORS)],
                marker=MARKERS[true % len(MARKERS)])

    if centroids is not None:
        C_r = centroids
        if transformer is not None:
            if hasattr(transformer, 'transform'):
                # Reuse the projection fitted on X so that centroids land in
                # the same coordinate system as the samples.
                C_r = transformer.transform(centroids)
            else:
                # Reducers without transform() cannot project new points;
                # keep the previous behaviour of fitting on the centroids.
                C_r = transformer.fit_transform(centroids)
        for cx in C_r:
            ax.plot(xs=[cx[0]], ys=[cx[1]], zs=[cx[2]], zdir='z',
                    marker=MARKERS[-1],
                    markersize=10,
                    c='red')
    plt.show()
# Data preparation function for Data and Labels split on both experiments
def data_preparation(dataset, experiment = "description_type"):
    """Split ``dataset`` into description texts and integer class labels.

    Args:
        dataset: DataFrame with a ``description`` column plus ``type`` (for
            the "description_type" experiment) or ``listed_in`` (for the
            "description_genre" experiment). Other columns are ignored.
        experiment: "description_type" labels each row 0 for 'TV Show' and
            1 for 'Movie'. "description_genre" labels each row with the
            code of its first listed genre; codes are assigned in order of
            first appearance.

    Returns:
        Tuple ``(Data, Labels)``: the description Series and an integer
        label Series sharing its index. Rows missing a needed value are
        dropped.

    Raises:
        ValueError: if ``experiment`` is not one of the two supported names,
            or if ``type`` holds a value other than 'TV Show' / 'Movie'.
    """
    # 1. Type based on Description => 2 labels
    if experiment == "description_type":
        description_type = dataset[['type', 'description']].dropna(axis=0)
        type_codes = {'TV Show': 0, 'Movie': 1}
        # Encode the label column only; a frame-wide replace could also
        # rewrite a description that happens to equal a type name.
        Labels = description_type['type'].map(type_codes)
        if Labels.isna().any():
            unknown = sorted(set(description_type['type']) - set(type_codes))
            raise ValueError(f"Unexpected values in 'type' column: {unknown}")
        Labels = Labels.astype(int)
        Data = description_type['description']
    # 2. Genre based on Description
    elif experiment == "description_genre":
        description_genre = dataset[['listed_in', 'description']].dropna(axis=0)
        # Only the first genre listed is used as the label.
        primary_genre = description_genre['listed_in'].str.split(', ').str[0]
        # factorize numbers the genres in order of first appearance, which is
        # the numbering the previous replace-in-a-loop produced, in one pass.
        codes, _ = pd.factorize(primary_genre)
        Labels = pd.Series(codes, index=description_genre.index, name='listed_in')
        Data = description_genre['description']
    else:
        # Fail loudly: printing a hint and falling through to the return
        # raised an UnboundLocalError on Data / Labels.
        raise ValueError("Please set the experiment to either 'description_type' or 'description_genre'")
    return Data, Labels
dataset_description = pd_train.copy()
# Single switch for the experiment to run ("description_type" or
# "description_genre"). The class-balance report further down reads this
# name, which was previously never assigned.
experiment = "description_type"
Data, Labels = data_preparation(dataset_description, experiment = experiment)
# Tokens are runs of two or more word characters; English stop words are removed.
tfidf = TfidfVectorizer(token_pattern = r"(?u)\b\w\w+\b", stop_words="english")
Data_vect = tfidf.fit_transform(Data)
print(Data_vect.shape)
# print the TF-IDF Matrix
# Each word receives a weight computed from its frequency within a single description and its frequency overall.
# The more frequent the word is overall, the lower its weight, since it carries less information for classification.
Data_vect = Data_vect.toarray()
print(Data_vect)
(7787, 17905) [[0. 0. 0. ... 0. 0. 0. ] [0. 0. 0. ... 0. 0. 0. ] [0. 0. 0. ... 0. 0. 0. ] ... [0. 0. 0. ... 0. 0. 0. ] [0.20814742 0. 0. ... 0. 0. 0. ] [0. 0. 0. ... 0. 0. 0. ]]
# Uncomment to inspect the vocabulary learned by the vectorizer.
# print(tfidf.vocabulary_)
# Display the data distribution based on the labels: the features are reduced
# with PCA to 2 and then 3 components, and Labels is passed as both the
# predicted and the true labels.
plot2d(Data_vect, Labels, Labels, PCA)
plot3d(Data_vect, Labels, Labels, PCA)
# Hold out 20% of the samples for testing; the seed keeps the split repeatable.
training_data, test_data, training_labels, test_labels = train_test_split(Data_vect, Labels, test_size = 0.2, random_state=42)
# Check how balanced the dataset is
# The branches are mutually exclusive (if / elif / else). With two separate
# `if` statements the final else also ran for "description_type" and printed
# the "Please choose" hint after a valid report.
if experiment == "description_type":
    print(f"Number of test data is {len(test_labels)} out of which {int((test_labels == 0).sum())} TV Shows and {int((test_labels == 1).sum())} Movies.")
elif experiment == "description_genre":
    print(f"Number of test data is {len(test_labels)} out of which {int((test_labels == 0).sum())} International TV Shows, {int((test_labels == 1).sum())} Dramas, {int((test_labels == 2).sum())} Horror Movies etc.")
else:
    print("Please choose the experiment you'd like to perform.")
Number of test data is 1558 out of which 468 TV Shows and 1090 Movies.
# Random forest on the TF-IDF features (seeded), scored on the held-out split.
classifier = RandomForestClassifier(random_state=0).fit(training_data, training_labels)
test_predict = classifier.predict(test_data)
acc = accuracy_score(y_true=test_labels, y_pred=test_predict)
print(f"The accuracy is {acc*100}.")
The accuracy is 37.869062901155324.
# Support vector classifier with regularization parameter C = 1, scored on
# the held-out split.
classifier = svm.SVC(C=1).fit(training_data, training_labels)
test_predict = classifier.predict(test_data)
acc = accuracy_score(y_true=test_labels, y_pred=test_predict)
print(f"The accuracy is {acc*100}.")
The accuracy is 32.22079589216945.
# L2-penalised logistic regression, where C is the regularization parameter.
# max_iter is raised explicitly: with the default limit the lbfgs solver
# stopped before converging (ConvergenceWarning), so the reported accuracy
# came from an unfinished fit.
classifier = LogisticRegression(penalty='l2', C = 100, max_iter=1000)
classifier.fit(training_data, training_labels)
test_predict = classifier.predict(test_data)
acc = accuracy_score(test_labels, test_predict)
print(f"The accuracy is {acc*100}.")
The accuracy is 41.3992297817715.
/Applications/anaconda3/envs/PyTorch/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:818: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG,
Type based on Description Experiment
Type based on Description Experiment